

<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
  <meta charset="utf-8" />
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  
  <title>互联 &mdash; Ceph Documentation</title>
  

  
  <link rel="stylesheet" href="../../_static/ceph.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/graphviz.css" type="text/css" />
  <link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />

  
  
    <link rel="shortcut icon" href="../../_static/favicon.ico"/>
  

  
  

  

  
  <!--[if lt IE 9]>
    <script src="../../_static/js/html5shiv.min.js"></script>
  <![endif]-->
  
    
      <script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
        <script src="../../_static/jquery.js"></script>
        <script src="../../_static/underscore.js"></script>
        <script src="../../_static/doctools.js"></script>
    
    <script type="text/javascript" src="../../_static/js/theme.js"></script>

    
    <link rel="index" title="Index" href="../../genindex/" />
    <link rel="search" title="Search" href="../../search/" />
    <link rel="next" title="Using perf" href="../perf/" />
    <link rel="prev" title="OSD class path issues" href="../osd-class-path/" /> 
</head>

<body class="wy-body-for-nav">

   
  <header class="top-bar">
    

















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="wy-breadcrumbs">
    
      <li><a href="../../" class="icon icon-home"></a> &raquo;</li>
        
          <li><a href="../internals/">Ceph 内幕</a> &raquo;</li>
        
      <li>互联</li>
    
    
      <li class="wy-breadcrumbs-aside">
        
          
            <a href="../../_sources/dev/peering.rst.txt" rel="nofollow"> View page source</a>
          
        
      </li>
    
  </ul>

  
  <hr/>
</div>
  </header>
  <div class="wy-grid-for-nav">
    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search"  style="background: #eee" >
          

          
            <a href="../../">
          

          
            
            <img src="../../_static/logo.png" class="logo" alt="Logo"/>
          
          </a>

          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="../../search/" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        
        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
              
            
            
              <ul class="current">
<li class="toctree-l1"><a class="reference internal" href="../../start/intro/">Ceph 简介</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../install/">安装 Ceph</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../cephadm/">Cephadm</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../rados/">Ceph 存储集群</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../cephfs/">Ceph 文件系统</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../rbd/">Ceph 块设备</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../radosgw/">Ceph 对象网关</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../mgr/">Ceph 管理器守护进程</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../mgr/dashboard/">Ceph 仪表盘</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../api/">API 文档</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../architecture/">体系结构</a></li>
<li class="toctree-l1"><a class="reference internal" href="../developer_guide/">开发者指南</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="../internals/">Ceph 内幕</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="../blkin/">Tracing Ceph With LTTng</a></li>
<li class="toctree-l2"><a class="reference internal" href="../blkin/#tracing-ceph-with-blkin">Tracing Ceph With Blkin</a></li>
<li class="toctree-l2"><a class="reference internal" href="../bluestore/">BlueStore Internals</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cache-pool/">Cache pool</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ceph_krb_auth/">如何配置好 Ceph Kerberos 认证的详细文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cephfs-mirroring/">CephFS Mirroring</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cephfs-reclaim/">CephFS Reclaim Interface</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cephfs-snapshots/">CephFS 快照</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cephx/">Cephx</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cephx_protocol/">Cephx 认证协议详细阐述</a></li>
<li class="toctree-l2"><a class="reference internal" href="../config/">配置管理系统</a></li>
<li class="toctree-l2"><a class="reference internal" href="../config-key/">config-key layout</a></li>
<li class="toctree-l2"><a class="reference internal" href="../context/">CephContext</a></li>
<li class="toctree-l2"><a class="reference internal" href="../continuous-integration/">Continuous Integration Architecture</a></li>
<li class="toctree-l2"><a class="reference internal" href="../corpus/">资料库结构</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cpu-profiler/">Oprofile 的安装</a></li>
<li class="toctree-l2"><a class="reference internal" href="../cxx/">C++17 and libstdc++ ABI</a></li>
<li class="toctree-l2"><a class="reference internal" href="../deduplication/">去重</a></li>
<li class="toctree-l2"><a class="reference internal" href="../delayed-delete/">CephFS delayed deletion</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dev_cluster_deployement/">开发集群的部署</a></li>
<li class="toctree-l2"><a class="reference internal" href="../dev_cluster_deployement/#id5">在同一机器上部署多套开发集群</a></li>
<li class="toctree-l2"><a class="reference internal" href="../development-workflow/">开发流程</a></li>
<li class="toctree-l2"><a class="reference internal" href="../documenting/">为 Ceph 写作文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../encoding/">序列化（编码、解码）</a></li>
<li class="toctree-l2"><a class="reference internal" href="../erasure-coded-pool/">纠删码存储池</a></li>
<li class="toctree-l2"><a class="reference internal" href="../file-striping/">File striping</a></li>
<li class="toctree-l2"><a class="reference internal" href="../freebsd/">FreeBSD Implementation details</a></li>
<li class="toctree-l2"><a class="reference internal" href="../generatedocs/">Ceph 文档的构建</a></li>
<li class="toctree-l2"><a class="reference internal" href="../health-reports/">Health Reports</a></li>
<li class="toctree-l2"><a class="reference internal" href="../iana/">IANA 号</a></li>
<li class="toctree-l2"><a class="reference internal" href="../kubernetes/">Hacking on Ceph in Kubernetes with Rook</a></li>
<li class="toctree-l2"><a class="reference internal" href="../libs/">库体系结构</a></li>
<li class="toctree-l2"><a class="reference internal" href="../logging/">集群日志的用法</a></li>
<li class="toctree-l2"><a class="reference internal" href="../logs/">调试日志</a></li>
<li class="toctree-l2"><a class="reference internal" href="../macos/">在 MacOS 上构建</a></li>
<li class="toctree-l2"><a class="reference internal" href="../messenger/">Messenger notes</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mon-bootstrap/">Monitor bootstrap</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mon-elections/">Monitor Elections</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mon-on-disk-formats/">ON-DISK FORMAT</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mon-osdmap-prune/">FULL OSDMAP VERSION PRUNING</a></li>
<li class="toctree-l2"><a class="reference internal" href="../msgr2/">msgr2 协议（ msgr2.0 和 msgr2.1 ）</a></li>
<li class="toctree-l2"><a class="reference internal" href="../network-encoding/">Network Encoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="../network-protocol/">网络协议</a></li>
<li class="toctree-l2"><a class="reference internal" href="../object-store/">对象存储架构概述</a></li>
<li class="toctree-l2"><a class="reference internal" href="../osd-class-path/">OSD class path issues</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">互联</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#id2">概念</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id3">互联过程描述</a></li>
<li class="toctree-l3"><a class="reference internal" href="#id4">状态机模型</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="../perf/">Using perf</a></li>
<li class="toctree-l2"><a class="reference internal" href="../perf_counters/">性能计数器</a></li>
<li class="toctree-l2"><a class="reference internal" href="../perf_histograms/">Perf histograms</a></li>
<li class="toctree-l2"><a class="reference internal" href="../placement-group/">PG （归置组）说明</a></li>
<li class="toctree-l2"><a class="reference internal" href="../quick_guide/">开发者指南（快速）</a></li>
<li class="toctree-l2"><a class="reference internal" href="../rados-client-protocol/">RADOS 客户端协议</a></li>
<li class="toctree-l2"><a class="reference internal" href="../rbd-diff/">RBD 增量备份</a></li>
<li class="toctree-l2"><a class="reference internal" href="../rbd-export/">RBD Export &amp; Import</a></li>
<li class="toctree-l2"><a class="reference internal" href="../rbd-layering/">RBD Layering</a></li>
<li class="toctree-l2"><a class="reference internal" href="../release-checklists/">Release checklists</a></li>
<li class="toctree-l2"><a class="reference internal" href="../release-process/">Ceph Release Process</a></li>
<li class="toctree-l2"><a class="reference internal" href="../seastore/">SeaStore</a></li>
<li class="toctree-l2"><a class="reference internal" href="../sepia/">Sepia 社区测试实验室</a></li>
<li class="toctree-l2"><a class="reference internal" href="../session_authentication/">Session Authentication for the Cephx Protocol</a></li>
<li class="toctree-l2"><a class="reference internal" href="../testing/">测试笔记</a></li>
<li class="toctree-l2"><a class="reference internal" href="../versions/">Public OSD Version</a></li>
<li class="toctree-l2"><a class="reference internal" href="../vstart-ganesha/">NFS CephFS-RGW Developer Guide</a></li>
<li class="toctree-l2"><a class="reference internal" href="../wireshark/">Wireshark Dissector</a></li>
<li class="toctree-l2"><a class="reference internal" href="../zoned-storage/">Zoned Storage Support</a></li>
<li class="toctree-l2"><a class="reference internal" href="../osd_internals/">OSD 开发者文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../mds_internals/">MDS 开发者文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../radosgw/">RADOS 网关开发者文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../ceph-volume/">ceph-volume 开发者文档</a></li>
<li class="toctree-l2"><a class="reference internal" href="../crimson/">Crimson developer documentation</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="../../governance/">项目管理</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../foundation/">Ceph 基金会</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../ceph-volume/">ceph-volume</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../releases/general/">Ceph 版本（总目录）</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../releases/">Ceph 版本（索引）</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../security/">Security</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../glossary/">Ceph 术语</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../jaegertracing/">Tracing</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../translation_cn/">中文版翻译资源</a></li>
</ul>

            
          
        </div>
        
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" aria-label="top navigation">
        
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="../../">Ceph</a>
        
      </nav>


      <div class="wy-nav-content">
        
        <div class="rst-content">
        
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
<div id="dev-warning" class="admonition note">
  <p class="first admonition-title">Notice</p>
  <p class="last">This document is for a development version of Ceph.</p>
</div>
  <div id="docubetter" align="right" style="padding: 5px; font-weight: bold;">
    <a href="https://pad.ceph.com/p/Report_Documentation_Bugs">Report a Documentation Bug</a>
  </div>

  
  <div class="section" id="id1">
<h1>互联<a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h1>
<div class="section" id="id2">
<h2>概念<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h2>
<dl class="simple">
<dt><em>Peering</em></dt><dd><p>the process of bringing all of the OSDs that store
a Placement Group (PG) into agreement about the state
of all of the objects (and their metadata) in that PG.
Note that agreeing on the state does not mean that
they all have the latest contents.</p>
</dd>
<dt><em>Acting set</em></dt><dd><p>the ordered list of OSDs who are (or were as of some epoch)
responsible for a particular PG.</p>
</dd>
<dt><em>Up set</em></dt><dd><p>the ordered list of OSDs responsible for a particular PG for
a particular epoch according to CRUSH.  Normally this
is the same as the <em>acting set</em>, except when the <em>acting set</em> has been
explicitly overridden via <em>PG temp</em> in the OSDMap.</p>
</dd>
<dt><em>PG temp</em></dt><dd><p>a temporary placement group acting set used while backfilling the
primary osd. Let’s say acting is [0,1,2] and we are
active+clean. Something happens and acting is now [3,1,2]. osd 3 is
empty and can’t serve reads although it is the primary. osd.3 will
see that and request a <em>PG temp</em> of [1,2,3] to the monitors using a
MOSDPGTemp message so that osd.1 temporarily becomes the
primary. It will select osd.3 as a backfill peer and continue to
serve reads and writes while osd.3 is backfilled. When backfilling
is complete, <em>PG temp</em> is discarded and the acting set changes back
to [3,1,2] and osd.3 becomes the primary.</p>
</dd>
<dt><em>current interval</em> or <em>past interval</em></dt><dd><p>a sequence of OSD map epochs during which the <em>acting set</em> and <em>up
set</em> for a particular PG do not change</p>
</dd>
<dt><em>primary</em></dt><dd><p>the (by convention first) member of the <em>acting set</em>,
who is responsible for coordinating peering, and is
the only OSD that will accept client initiated
writes to objects in a placement group.</p>
</dd>
<dt><em>replica</em></dt><dd><p>a non-primary OSD in the <em>acting set</em> for a placement group
(and who has been recognized as such and <em>activated</em> by the primary).</p>
</dd>
<dt><em>stray</em></dt><dd><p>an OSD who is not a member of the current <em>acting set</em>, but
has not yet been told that it can delete its copies of a
particular placement group.</p>
</dd>
<dt><em>recovery</em></dt><dd><p>ensuring that copies of all of the objects in a PG
are on all of the OSDs in the <em>acting set</em>.  Once
<em>peering</em> has been performed, the primary can start
accepting write operations, and <em>recovery</em> can proceed
in the background.</p>
</dd>
<dt><em>PG info</em></dt><dd><p>basic metadata about the PG’s creation epoch, the version
for the most recent write to the PG, <em>last epoch started</em>, <em>last
epoch clean</em>, and the beginning of the <em>current interval</em>.  Any
inter-OSD communication about PGs includes the <em>PG info</em>, such that
any OSD that knows a PG exists (or once existed) also has a lower
bound on <em>last epoch clean</em> or <em>last epoch started</em>.</p>
</dd>
<dt><em>PG log</em></dt><dd><p>a list of recent updates made to objects in a PG.
Note that these logs can be truncated after all OSDs
in the <em>acting set</em> have acknowledged up to a certain
point.</p>
</dd>
<dt><em>missing set</em></dt><dd><p>Each OSD notes update log entries and if they imply updates to
the contents of an object, adds that object to a list of needed
updates.  This list is called the <em>missing set</em> for that &lt;OSD,PG&gt;.</p>
</dd>
<dt><em>Authoritative History</em></dt><dd><p>a complete, and fully ordered set of operations that, if
performed, would bring an OSD’s copy of a Placement Group
up to date.</p>
</dd>
<dt><em>epoch</em></dt><dd><p>a (monotonically increasing) OSD map version number</p>
</dd>
<dt><em>last epoch started</em></dt><dd><p>the last epoch at which all nodes in the <em>acting set</em>
for a particular placement group agreed on an
<em>authoritative history</em>.  At this point, <em>peering</em> is
deemed to have been successful.</p>
</dd>
<dt><em>up_thru</em></dt><dd><p>before a primary can successfully complete the <em>peering</em> process,
it must inform a monitor that it is alive through the current
OSD map epoch by having the monitor set its <em>up_thru</em> in the osd
map.  This helps peering ignore previous <em>acting sets</em> for which
peering never completed after certain sequences of failures, such as
the second interval below:</p>
<ul class="simple">
<li><p><em>acting set</em> = [A,B]</p></li>
<li><p><em>acting set</em> = [A]</p></li>
<li><p><em>acting set</em> = [] very shortly after (e.g., simultaneous failure, but staggered detection)</p></li>
<li><p><em>acting set</em> = [B] (B restarts, A does not)</p></li>
</ul>
</dd>
<dt><em>last epoch clean</em></dt><dd><p>the last epoch at which all nodes in the <em>acting set</em>
for a particular placement group were completely
up to date (both PG logs and object contents).
At this point, <em>recovery</em> is deemed to have been
completed.</p>
</dd>
</dl>
</div>
<div class="section" id="id3">
<h2>互联过程描述<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h2>
<p>The <em>Golden Rule</em> is that no write operation to any PG
is acknowledged to a client until it has been persisted
by all members of the <em>acting set</em> for that PG.  This means
that if we can communicate with at least one member of
each <em>acting set</em> since the last successful <em>peering</em>, someone
will have a record of every (acknowledged) operation
since the last successful <em>peering</em>.
This means that it should be possible for the current
primary to construct and disseminate a new <em>authoritative history</em>.</p>
<p>It is also important to appreciate the role of the OSD map
(list of all known OSDs and their states, as well as some
information about the placement groups) in the <em>peering</em>
process:</p>
<blockquote>
<div><p>When OSDs go up or down (or get added or removed)
this has the potential to affect the <em>acting sets</em>
of many placement groups.</p>
<p>Before a primary successfully completes the <em>peering</em>
process, the OSD map must reflect that the OSD was alive
and well as of the first epoch in the <em>current interval</em>.</p>
<p>Changes can only be made after successful <em>peering</em>.</p>
</div></blockquote>
<p>Thus, a new primary can use the latest OSD map along with a recent
history of past maps to generate a set of <em>past intervals</em> to
determine which OSDs must be consulted before we can successfully
<em>peer</em>.  The set of past intervals is bounded by <em>last epoch started</em>,
the most recent <em>past interval</em> for which we know <em>peering</em> completed.
The process by which an OSD discovers a PG exists in the first place is
by exchanging <em>PG info</em> messages, so the OSD always has some lower
bound on <em>last epoch started</em>.</p>
<p>The high level process is for the current PG primary to:</p>
<blockquote>
<div><ol class="arabic">
<li><p>get a recent OSD map (to identify the members of all the
interesting <em>acting sets</em>, and confirm that we are still the
primary).</p></li>
<li><p>generate a list of <em>past intervals</em> since <em>last epoch started</em>.
Consider the subset of those for which <em>up_thru</em> was greater than
the first interval epoch by the last interval epoch’s OSD map; that is,
the subset for which <em>peering</em> could have completed before the <em>acting
set</em> changed to another set of OSDs.</p>
<p>Successful <em>peering</em> will require that we be able to contact at
least one OSD from each <em>past interval</em>’s <em>acting set</em>.</p>
</li>
<li><p>ask every node in that list for its <em>PG info</em>, which includes the most
recent write made to the PG, and a value for <em>last epoch started</em>.  If
we learn about a <em>last epoch started</em> that is newer than our own, we can
prune older <em>past intervals</em> and reduce the peer OSDs we need to contact.</p></li>
<li><p>if anyone else has (in its PG log) operations that I do not have,
instruct them to send me the missing log entries so that the primary’s
<em>PG log</em> is up to date (includes the newest write).</p></li>
<li><p>for each member of the current <em>acting set</em>:</p>
<ol class="loweralpha">
<li><p>ask it for copies of all PG log entries since <em>last epoch started</em>
so that I can verify that they agree with mine (or know what
objects I will be telling it to delete).</p>
<p>If the cluster failed before an operation was persisted by all
members of the <em>acting set</em>, and the subsequent <em>peering</em> did not
remember that operation, and a node that did remember that
operation later rejoined, its logs would record a different
(divergent) history than the <em>authoritative history</em> that was
reconstructed in the <em>peering</em> after the failure.</p>
<p>Since the <em>divergent</em> events were not recorded in other logs
from that <em>acting set</em>, they were not acknowledged to the client,
and there is no harm in discarding them (so that all OSDs agree
on the <em>authoritative history</em>).  But, we will have to instruct
any OSD that stores data from a divergent update to delete the
affected (and now deemed to be apocryphal) objects.</p>
</li>
<li><p>ask it for its <em>missing set</em> (object updates recorded
in its PG log, but for which it does not have the new data).
This is the list of objects that must be fully replicated
before we can accept writes.</p></li>
</ol>
</li>
<li><p>at this point, the primary’s PG log contains an <em>authoritative history</em> of
the placement group, and the OSD now has sufficient
information to bring any other OSD in the <em>acting set</em> up to date.</p></li>
<li><p>if the primary’s <em>up_thru</em> value in the current OSD map is not greater than
or equal to the first epoch in the <em>current interval</em>, send a request to the
monitor to update it, and wait until we receive an updated OSD map that reflects
the change.</p></li>
<li><p>for each member of the current <em>acting set</em>:</p>
<ol class="loweralpha simple">
<li><p>send them log updates to bring their PG logs into agreement with
my own (<em>authoritative history</em>) … which may involve deciding
to delete divergent objects.</p></li>
<li><p>await acknowledgment that they have persisted the PG log entries.</p></li>
</ol>
</li>
<li><p>at this point all OSDs in the <em>acting set</em> agree on all of the meta-data,
and would (in any future <em>peering</em>) return identical accounts of all
updates.</p>
<ol class="loweralpha">
<li><p>start accepting client write operations (because we have unanimous
agreement on the state of the objects into which those updates are
being accepted).  Note, however, that if a client tries to write to an
object it will be promoted to the front of the recovery queue, and the
write will be applied after it is fully replicated to the current <em>acting set</em>.</p></li>
<li><p>update the <em>last epoch started</em> value in our local <em>PG info</em>, and instruct
other <em>acting set</em> OSDs to do the same.</p></li>
<li><p>start pulling object data updates that other OSDs have, but I do not.  We may
need to query OSDs from additional <em>past intervals</em> prior to <em>last epoch started</em>
(the last time <em>peering</em> completed) and following <em>last epoch clean</em> (the last epoch that
recovery completed) in order to find copies of all objects.</p></li>
<li><p>start pushing object data updates to other OSDs that do not yet have them.</p>
<p>We push these updates from the primary (rather than having the replicas
pull them) because this allows the primary to ensure that a replica has
the current contents before sending it an update write.  It also makes
it possible for a single read (from the primary) to be used to write
the data to multiple replicas.  If each replica did its own pulls,
the data might have to be read multiple times.</p>
</li>
</ol>
</li>
<li><p>once all replicas store copies of all objects (that
existed prior to the start of this epoch) we can update <em>last
epoch clean</em> in the <em>PG info</em>, and we can dismiss all of the
<em>stray</em> replicas, allowing them to delete their copies of objects
for which they are no longer in the <em>acting set</em>.</p>
<p>We could not dismiss the <em>strays</em> prior to this because it was possible
that one of those <em>strays</em> might hold the sole surviving copy of an
old object (all of whose copies disappeared before they could be
replicated on members of the current <em>acting set</em>).</p>
</li>
</ol>
</div></blockquote>
</div>
<div class="section" id="id4">
<h2>状态机模型<a class="headerlink" href="#id4" title="Permalink to this headline">¶</a></h2>
<div class="graphviz"><object data="../../_images/graphviz-253db4a80213209e3afc407333328e65a7975a81.svg" type="image/svg+xml" class="graphviz">
<p class="warning">digraph G {
	size=&quot;7,7&quot;
	compound=true;
	subgraph cluster0 {
		label = &quot;PeeringMachine&quot;;
		color = &quot;black&quot;;
		Crashed;
		Initial[shape=Mdiamond style=filled fillcolor=lightgrey];
		Reset;
		subgraph cluster1 {
			label = &quot;Started&quot;;
			color = &quot;black&quot;;
			Start[shape=Mdiamond style=filled fillcolor=lightgrey];
			subgraph cluster2 {
				label = &quot;Primary&quot;;
				color = &quot;black&quot;;
				WaitActingChange;
				subgraph cluster3 {
					label = &quot;Peering&quot;;
					color = &quot;black&quot;;
					style = &quot;filled&quot;;
					fillcolor = &quot;lightgrey&quot;;
					GetInfo[shape=Mdiamond style=filled fillcolor=lightgrey];
					GetLog;
					GetMissing;
					WaitUpThru;
					Down;
					Incomplete;
				}
				subgraph cluster4 {
					label = &quot;Active&quot;;
					color = &quot;black&quot;;
					Clean;
					Recovered;
					Backfilling;
					WaitRemoteBackfillReserved;
					WaitLocalBackfillReserved;
					NotBackfilling;
					NotRecovering;
					Recovering;
					WaitRemoteRecoveryReserved;
					WaitLocalRecoveryReserved;
					Activating[shape=Mdiamond style=filled fillcolor=lightgrey];
				}
			}
			subgraph cluster5 {
				label = &quot;ReplicaActive&quot;;
				color = &quot;black&quot;;
				RepRecovering;
				RepWaitBackfillReserved;
				RepWaitRecoveryReserved;
				RepNotRecovering[shape=Mdiamond style=filled fillcolor=lightgrey];
			}
			Stray;
			subgraph cluster6 {
				label = &quot;ToDelete&quot;;
				color = &quot;black&quot;;
				WaitDeleteReserved[shape=Mdiamond style=filled fillcolor=lightgrey];
				Deleting;
			}
		}
	}
Initial -&gt; Reset [label=&quot;Initialize&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
Initial -&gt; Crashed [label=&quot;boost::statechart::event_base&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Reset -&gt; Crashed [label=&quot;boost::statechart::event_base&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
Start -&gt; Crashed [label=&quot;boost::statechart::event_base&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,ltail=cluster1,];
Start -&gt; GetInfo [label=&quot;MakePrimary&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,lhead=cluster2,];
Start -&gt; Stray [label=&quot;MakeStray&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
GetInfo -&gt; Activating [label=&quot;Activate&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,ltail=cluster3,lhead=cluster4,];
Clean -&gt; WaitLocalRecoveryReserved [label=&quot;DoRecovery&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
Recovered -&gt; WaitLocalRecoveryReserved [label=&quot;DoRecovery&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
NotRecovering -&gt; WaitLocalRecoveryReserved [label=&quot;DoRecovery&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
Activating -&gt; WaitLocalRecoveryReserved [label=&quot;DoRecovery&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
Recovered -&gt; Clean [label=&quot;GoClean&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
WaitRemoteBackfillReserved -&gt; Backfilling [label=&quot;AllBackfillsReserved&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
WaitLocalBackfillReserved -&gt; WaitRemoteBackfillReserved [label=&quot;LocalBackfillReserved&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
NotBackfilling -&gt; WaitLocalBackfillReserved [label=&quot;RequestBackfill&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
Activating -&gt; WaitLocalBackfillReserved [label=&quot;RequestBackfill&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Recovering -&gt; WaitLocalBackfillReserved [label=&quot;RequestBackfill&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
RepNotRecovering -&gt; WaitDeleteReserved [label=&quot;DeleteStart&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,ltail=cluster5,lhead=cluster6,];
Stray -&gt; WaitDeleteReserved [label=&quot;DeleteStart&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,lhead=cluster6,];
RepRecovering -&gt; RepNotRecovering [label=&quot;RecoveryDone&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
RepNotRecovering -&gt; RepNotRecovering [label=&quot;RecoveryDone&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
RepRecovering -&gt; RepNotRecovering [label=&quot;RemoteReservationRejectedTooFull&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
RepNotRecovering -&gt; RepNotRecovering [label=&quot;RemoteReservationRejectedTooFull&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
WaitRemoteBackfillReserved -&gt; NotBackfilling [label=&quot;RemoteReservationRejectedTooFull&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
RepWaitBackfillReserved -&gt; RepNotRecovering [label=&quot;RemoteReservationRejectedTooFull&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
RepRecovering -&gt; RepNotRecovering [label=&quot;RemoteReservationCanceled&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
RepNotRecovering -&gt; RepNotRecovering [label=&quot;RemoteReservationCanceled&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
RepWaitRecoveryReserved -&gt; RepNotRecovering [label=&quot;RemoteReservationCanceled&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
RepWaitBackfillReserved -&gt; RepNotRecovering [label=&quot;RemoteReservationCanceled&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
WaitRemoteRecoveryReserved -&gt; Recovering [label=&quot;AllRemotesReserved&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
WaitLocalRecoveryReserved -&gt; WaitRemoteRecoveryReserved [label=&quot;LocalRecoveryReserved&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
Activating -&gt; Recovered [label=&quot;AllReplicasRecovered&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
Recovering -&gt; Recovered [label=&quot;AllReplicasRecovered&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
WaitDeleteReserved -&gt; Deleting [label=&quot;DeleteReserved&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
Deleting -&gt; WaitDeleteReserved [label=&quot;DeleteInterrupted&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
GetInfo -&gt; GetLog [label=&quot;GotInfo&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
GetInfo -&gt; Down [label=&quot;IsDown&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
GetLog -&gt; WaitActingChange [label=&quot;NeedActingChange&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
GetLog -&gt; Incomplete [label=&quot;IsIncomplete&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
GetMissing -&gt; WaitUpThru [label=&quot;NeedUpThru&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
Initial -&gt; GetInfo [label=&quot;MNotifyRec&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,lhead=cluster2,];
Down -&gt; GetInfo [label=&quot;MNotifyRec&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
Incomplete -&gt; GetLog [label=&quot;MNotifyRec&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
Initial -&gt; Stray [label=&quot;MInfoRec&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Stray -&gt; RepNotRecovering [label=&quot;MInfoRec&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,lhead=cluster5,];
Initial -&gt; Stray [label=&quot;MLogRec&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
Stray -&gt; RepNotRecovering [label=&quot;MLogRec&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,lhead=cluster5,];
Start -&gt; Reset [label=&quot;AdvMap&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,ltail=cluster1,];
GetInfo -&gt; Reset [label=&quot;AdvMap&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,ltail=cluster3,];
GetLog -&gt; Reset [label=&quot;AdvMap&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
WaitActingChange -&gt; Reset [label=&quot;AdvMap&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Incomplete -&gt; Reset [label=&quot;AdvMap&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
Reset -&gt; Start [label=&quot;ActMap&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,lhead=cluster1,];
WaitDeleteReserved -&gt; WaitDeleteReserved [label=&quot;ActMap&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,ltail=cluster6,lhead=cluster6,];
Backfilling -&gt; Recovered [label=&quot;Backfilled&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
Backfilling -&gt; NotBackfilling [label=&quot;DeferBackfill&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
Backfilling -&gt; NotBackfilling [label=&quot;UnfoundBackfill&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
Backfilling -&gt; NotBackfilling [label=&quot;RemoteReservationRevokedTooFull&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Backfilling -&gt; WaitLocalBackfillReserved [label=&quot;RemoteReservationRevoked&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
WaitRemoteBackfillReserved -&gt; NotBackfilling [label=&quot;RemoteReservationRevoked&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
RepWaitRecoveryReserved -&gt; RepRecovering [label=&quot;RemoteRecoveryReserved&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
RepNotRecovering -&gt; RepWaitBackfillReserved [label=&quot;RequestBackfillPrio&quot;,color=&quot;#40e0d0&quot;,fontcolor=&quot;#40e0d0&quot;,];
RepNotRecovering -&gt; RepWaitRecoveryReserved [label=&quot;RequestRecoveryPrio&quot;,color=&quot;#c71585&quot;,fontcolor=&quot;#c71585&quot;,];
RepWaitBackfillReserved -&gt; RepRecovering [label=&quot;RemoteBackfillReserved&quot;,color=&quot;#000000&quot;,fontcolor=&quot;#000000&quot;,];
WaitLocalRecoveryReserved -&gt; NotRecovering [label=&quot;RecoveryTooFull&quot;,color=&quot;#1e90ff&quot;,fontcolor=&quot;#1e90ff&quot;,];
Recovering -&gt; NotRecovering [label=&quot;DeferRecovery&quot;,color=&quot;#ff0000&quot;,fontcolor=&quot;#ff0000&quot;,];
Recovering -&gt; NotRecovering [label=&quot;UnfoundRecovery&quot;,color=&quot;#0000ff&quot;,fontcolor=&quot;#0000ff&quot;,];
GetLog -&gt; GetMissing [label=&quot;GotLog&quot;,color=&quot;#ffa500&quot;,fontcolor=&quot;#ffa500&quot;,];
}</p></object></div>
</div>
</div>



           </div>
           
          </div>
          <footer>
            <!-- Page footer: previous/next page links, followed by the copyright and license notice. -->
            <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
              <!-- "Next" precedes "Previous" in source order; the float-right / float-left classes presumably place them via the theme CSS (stylesheet not visible here; confirm before reordering). -->
              <a class="btn btn-neutral float-right" href="../perf/" title="Using perf" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
              <a class="btn btn-neutral float-left" href="../osd-class-path/" title="OSD class path issues" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
            </div>

            <hr>

            <!-- Copyright and license statement for the documentation. -->
            <div role="contentinfo">
              <p>
                &#169; Copyright 2016, Ceph authors and contributors. Licensed under Creative Commons Attribution Share Alike 3.0 (CC-BY-SA-3.0).
              </p>
            </div>
          </footer>
        </div>
      </div>

    </section>

  </div>
  

  <script>
      // Passing a function to jQuery() registers it as a DOM-ready callback.
      // The callback switches on the theme's navigation handling; the `true`
      // argument's meaning is defined in the theme script (presumably sticky
      // navigation; confirm against _static/js/theme.js).
      jQuery(function enableThemeNavigation() {
          SphinxRtdTheme.Navigation.enable(true);
      });
  </script>

  
  
    
   

</body>
</html>